Draft for the section general stats

Parameters

Inherited parameter and data from the import tab.

filtered_data <- read.csv("../data/data-2023-09-11 (2).csv", header = TRUE)

selected_stats <- c("Ho", "Hs", "Ht", "Fis (W&C)", "Fst (W&C)", "Fis (Nei)", "Fst (Nei)")

n_rep = 1000
n_marker = 6
n_pop = 8

sequence_length <- length(6:11)

num_cores <- 16

data filtering convertion

library(hierfstat)
library(adegenet)
filtered_data <- data.frame(indv = paste(substr(filtered_data$Population, 1, 3),
    row.names(filtered_data), sep = "."), filtered_data)
# Create mydata_genind
population <- filtered_data$Population
mydata_genind <- df2genind(X = filtered_data[, 6:11], sep = "/", ncode = 6, ind.names = filtered_data$indv,
    pop = filtered_data$Population, NA.char = "0/0", ploidy = 2, type = "codom",
    strata = NULL, hierarchy = NULL)
mydata_genind
mydata_hierfstat <- genind2hierfstat(mydata_genind)

Missing data

# Libraries
library("poppr")
## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4
## This is poppr version 2.9.4. To get started, type package?poppr
## OMP parallel support: available
library("heatmaply")
## Loading required package: plotly
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan
## 
## ======================
## Welcome to heatmaply version 1.5.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
missing_data <- info_table(mydata_genind, plot = FALSE)

# Matrix format
mat <- as.matrix(missing_data)
# heatmap
p <- heatmaply(mat, dendrogram = "none", xlab = "", ylab = "", main = "", scale = "column",
    margins = c(60, 100, 40, 20), grid_color = "white", grid_width = 1e-05, titleX = FALSE,
    hide_colorbar = TRUE, branches_lwd = 0.1, label_names = c("Population", "Marker",
        "Value"), fontsize_row = 8, fontsize_col = 8, labCol = colnames(mat), labRow = rownames(mat),
    heatmap_layers = theme(axis.line = element_blank()))
p

Run basic.stats and render the result

library(pegas)
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:hierfstat':
## 
##     pcoa, varcomp
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## The following object is masked from 'package:ade4':
## 
##     amova
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:ape':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)

result <- basic.stats(mydata_hierfstat)  #hierfstat
df_resutl_basic <- as.data.frame(result$perloc)

# Weir and Cockrham estimates of Fstatistics - FIS and FST
result_f_stats <- Fst(as.loci(mydata_genind), na.alleles = "")  #pegas
result_f_stats <- result_f_stats[, 2:3]
colnames(result_f_stats) <- c("Fst (W&C)", "Fis (W&C)")
result_f_stats <- merge(result_f_stats, df_resutl_basic, by = "row.names", all.x = TRUE)
colnames(result_f_stats)[10] <- "Fst (Nei)"
colnames(result_f_stats)[12] <- "Fis (Nei)"
result_f_stats <- result_f_stats %>%
    column_to_rownames(., var = "Row.names")
result_f_stats_selec <- result_f_stats %>%
    select(all_of(selected_stats))
result_f_stats_selec

missing data / FIS

library(hrbrthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ stringr   1.5.0
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::where()  masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
a <- result_f_stats_selec %>%
    select("Fis (W&C)")
b <- as_tibble(mat) %>%
    filter(row_number() == n()) %>%
    rownames_to_column %>%
    gather(variable, value, -rowname) %>%
    spread(rowname, value) %>%
    slice(1:(n() - 1)) %>%
    column_to_rownames("variable")
colnames(b) <- ("Missing %")
c <- merge(a, b, by = "row.names", all.x = TRUE) %>%
    column_to_rownames("Row.names")

# Plot with linear trend
p1 <- ggplot(c, aes(x = `Missing %`, y = `Fis (W&C)`)) + geom_point() + theme_ipsum()
p1

G-statistic

library(ggplot2)

# compute the Gstats
result_f_stats <- result_f_stats %>%
    mutate(GST = 1 - Hs/Ht)
result_f_stats <- result_f_stats %>%
    mutate(`GST''` = (n_pop * (Ht - Hs))/((n_pop * Hs - Ht) * (1 - Hs)))
result_f_stats
# Plot with linear trend
p2 <- ggplot(result_f_stats, aes(x = GST, y = Hs)) + geom_point() + geom_smooth(method = lm,
    color = "red", se = FALSE) + theme_ipsum()
p2
## `geom_smooth()` using formula = 'y ~ x'

HW - Panmixia

with pegas

library("pegas")
hw.test(as.loci(mydata_genind), B = n_rep)
##         chi^2 df Pr(chi^2 >) Pr.exact
## B12 693.49866 15   0.0000000    0.000
## C07  27.78220 28   0.4760334    0.104
## D12 799.33270 66   0.0000000    0.000
## D10 492.53039 28   0.0000000    0.000
## A12  11.89948 10   0.2918403    0.225
## C03  56.70154 36   0.0153672    0.099

with genepop

library(radiator)
# https://thierrygosselin.github.io/radiator/reference/genomic_converter.html
# https://thierrygosselin.github.io/radiator/articles/rad_genomics_computer_setup.html

# mydata1 <- genomic_converter(mydata_genind, parallel.core =
# parallel::detectCores() - 1, output = 'genepop', filename =
# 'mydata.genepop.txt')

library(genepop)
# https://cran.r-project.org/web/packages/genepop/genepop.pdf genepop_HW <-
# test_HW( inputFile =
# '05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt', which =
# 'Proba', outputFile = '', settingsFile = '', enumeration = FALSE,
# dememorization = 10000, batches = 20, iterations = 5000, verbose =
# interactive() )

Linkage Disequilibrium

I have found different values from Fstats.

with poppr

The index of association was originally developed by A.H.D. Brown analyzing population structure of wild barley (Brown, 1980).

Ia: The index of association. p.Ia: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call. rbard: The standardized index of association. p.rd: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call.

pair.ia calculates the index of association in a pairwise manner among all loci.

method = 1: Permute Alleles This will redistribute all alleles in the sample throughout the locus. Missing data is fixed in place. This maintains allelic structure, but heterozygosity is variable.

method = 4: Multilocus Style Permutation This will shuffle the genotypes at each locus, maintaining the heterozygosity and allelic structure.

loci_pair <- pair.ia(mydata_genind, sample = n_rep, quiet = FALSE, method = 1,plot = FALSE)
loci_pair
##              Ia    p.Ia   rbarD    p.rD
## B12:C07 -0.0039  0.9901 -0.0039  0.9901
## B12:D12  0.0545  0.8317  0.0547  0.8317
## B12:D10  0.0204  0.2079  0.0204  0.2079
## B12:A12  0.0356  0.0099  0.0356  0.0099
## B12:C03  0.0531  0.1683  0.0534  0.1782
## C07:D12 -0.0233  0.8317 -0.0233  0.8317
## C07:D10  0.0172  0.4059  0.0172  0.4059
## C07:A12  0.0364  0.5941  0.0365  0.5941
## C07:C03  0.1252  0.0693  0.1254  0.0693
## D12:D10  0.0708  0.0099  0.0709  0.0198
## D12:A12  0.0244  0.1584  0.0245  0.1584
## D12:C03 -0.0135  0.6139 -0.0135  0.6139
## D10:A12  0.0170  0.3465  0.0170  0.3465
## D10:C03  0.0456  0.1089  0.0458  0.1089
## A12:C03  0.0284  0.0990  0.0287  0.0990

with genepop

“Genotypes at one locus are independent from genotypes at the other locus”. For a pair of diploid loci, no assumption is made about the gametic phase in double heterozygotes. In particular, it is not inferred assuming one-locus HW equilibrium, as such equilibrium is not assumed anywhere in the formulation of the test. The test is thus one of association between diploid genotypes at both loci, sometimes described as a test of the composite linkage disequilibrium (Bruce S. Weir 1996, 126–28). Contingency tables are created for all pairs of loci in each sample, then a G test or a probability test for each table is computed for each table using the Markov chain algorithm of Raymond and Rousset (1995a). The number of switches of the algorithm is given for each table analyzed.

library(genepop)
outfile <- test_LD(inputFile = "05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt",
    outputFile = "", settingsFile = "", dememorization = 10000, batches = 100, iterations = n_rep,
    verbose = interactive())
readLines(outfile)[136:155]
##  [1] "P-value for each locus pair across all populations"   
##  [2] "(Fisher's method)"                                    
##  [3] "-----------------------------------------------------"
##  [4] "Locus pair                    Chi2      df   P-Value" 
##  [5] "--------------------          --------  ---  --------"
##  [6] "A12           & B12           9.655835  16   0.883973"
##  [7] "A12           & C03           22.903568 16   0.116337"
##  [8] "B12           & C03           20.823728 16   0.185386"
##  [9] "A12           & C07           8.267152  16   0.940512"
## [10] "B12           & C07           15.309280 16   0.502114"
## [11] "C03           & C07           23.809203 16   0.093755"
## [12] "A12           & D10           14.850121 16   0.535642"
## [13] "B12           & D10           9.224176  16   0.903895"
## [14] "C03           & D10           19.495390 16   0.243812"
## [15] "C07           & D10           12.347709 16   0.719717"
## [16] "A12           & D12           11.357848 16   0.786877"
## [17] "B12           & D12           12.699920 16   0.694559"
## [18] "C03           & D12           5.696802  16   0.991052"
## [19] "C07           & D12           31.371040 16   0.012061"
## [20] "D10           & D12           14.675705 16   0.548506"

with pegas

LD2 is based on the observed frequencies of different genotypes (Schaid 2004).

mat_alleles <- filtered_data %>%
    select(Population)
mat_alleles <- cbind(mat_alleles, filtered_data[, 6:11])
mat_alleles_loci <- alleles2loci(mat_alleles, ploidy = 1, population = 1, phased = FALSE)
linkage_pegas <- LD2(mat_alleles_loci)
print(linkage_pegas$T2)
##          T2          df       P-val 
## 69.43451606 48.00000000  0.02313167

Panmixia - HW

library(poppr)
library(pegas)
library(dplyr)
library(tibble)
# Permute Alleles This will redistribute all alleles in the sample throughout
# the locus. Missing data is fixed in place. #This maintains allelic structure,
# but heterozygosity is variable.


es <- replicate(n_rep, (shufflepop(mydata_genind, method = 1)))
fis_df <- numeric(sequence_length)

# for (i in 1:n_rep){ # Calculate the statistics for the i-th matrix
# result_fis_stats <- as_tibble(Fst(as.loci(es[[i]]))) %>% select('Fis' ) #
# Assign values to the data frames fis_df <- cbind(fis_df,
# result_fis_stats$Fis) }

library(foreach)
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Create a list to store the results
fis_list <- foreach(i = 1:n_rep, .combine = "c") %dopar% {
    library(dplyr)
    library(pegas)
    # Calculate the statistics for the i-th matrix
    result_fis_stats <- as_tibble(Fst(as.loci(es[[i]]))) %>%
        select(Fis)
    return(result_fis_stats$Fis)
}

# Combine the results into a matrix
fis_df <- matrix(unlist(fis_list), nrow = n_rep, byrow = TRUE)
fis_df <- t(fis_df)
# Stop the parallel processing cluster
stopCluster(cl)

# Set row names as in result_f_stats
rownames(fis_df) <- rownames(result_f_stats)
# fis_df <-fis_df[, -1]
vec <- seq(1, n_rep)
colnames(fis_df) <- vec


# Initialize an empty data frame to store the counts
fis_df_Greater <- numeric(sequence_length)
fis_df_Smaller <- numeric(sequence_length)

# Compare the values in result_f_stats[1] to result_FST for each column
for (col in colnames(fis_df)) {
    greater_count <- as_tibble(result_f_stats[2] > fis_df[, col])
    smaller_count <- as_tibble(result_f_stats[2] < fis_df[, col])
    fis_df_Greater <- cbind(fis_df_Greater, greater_count$`Fis (W&C)`)
    fis_df_Smaller <- cbind(fis_df_Smaller, smaller_count$`Fis (W&C)`)
}

# Set row names as in result_f_stats
rownames(fis_df_Smaller) <- rownames(fis_df_Greater) <- rownames(result_f_stats)
fis_df_Smaller <- fis_df_Smaller[, -1]
fis_df_Greater <- fis_df_Greater[, -1]
vec <- seq(1, n_rep)
colnames(fis_df_Smaller) <- colnames(fis_df_Greater) <- vec


fis_df_Smaller_av <- as.data.frame(fis_df_Smaller) %>%
    mutate(average = rowSums(across(where(is.numeric)))/n_rep) %>%
    select(average)
fis_df_Greater_av <- as.data.frame(fis_df_Greater) %>%
    mutate(average = rowSums(across(where(is.numeric)))/n_rep) %>%
    select(average)

rownames(fis_df_Smaller_av) <- rownames(fis_df_Greater_av) <- rownames(result_f_stats)

fis_df_sg <- merge(fis_df_Smaller_av, fis_df_Greater_av, by = "row.names", all.x = TRUE) %>%
    column_to_rownames("Row.names")


colnames(fis_df_sg) <- c("smaller", "greater")

fis_df_sg_t <- fis_df_sg %>%
    mutate(`Two-sided p-values` = ifelse(smaller > 0.5, 2 * (1 - smaller), 2 * smaller))

# standard deviation SE Calculate the standard deviation for each element in
# fis_df
std_dev <- as.data.frame(apply(fis_df, 1, sd))
colnames(std_dev) <- ("std_dev")

################################# Calculate SE
################################# ################################# join df
std_dev <- std_dev %>%
    mutate(SE = std_dev/sqrt(n_rep) * 4)
rownames(std_dev) <- c(rownames(fis_df_sg_t))

df_merge <- merge(std_dev, fis_df_sg_t, by = "row.names", all.x = TRUE) %>%
    column_to_rownames("Row.names")
df_merge <- merge(df_merge, result_f_stats[2], by = "row.names", all.x = TRUE) %>%
    column_to_rownames("Row.names")
df_merge <- df_merge %>%
    mutate(t = qt(1 - 0.05/2, n_pop - 1)) %>%
    mutate(`95%CI_i` = `Fis (W&C)` - t * SE) %>%
    mutate(`95%CI_s` = `Fis (W&C)` + t * SE) %>%
    mutate()
# plot the point plot
library(ggplot2)

# Create a ggplot using df_merge
p <- ggplot(df_merge, aes(x = rownames(df_merge), y = `Fis (W&C)`)) + geom_point() +
    geom_errorbar(aes(ymin = `95%CI_i`, ymax = `95%CI_s`), width = 0.2, position = position_dodge(0.05)) +
    xlab("Category") + ylab("Fis (W&C)")

# Rotate X-axis labels for better readability
p + theme(axis.text.x = element_text(angle = 45, hjust = 1))

DEVELOPMENT

not ready for deployment

File export

library(radiator)

genomic_converter(
  data,
  strata = NULL,
  output = NULL,
  filename = NULL,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
)

library(hierfstat)
write.fstat
write.struct